In [1]:
from IPython.core.display import display, HTML
display(HTML("""<style> .container {width:96% !important;}</style>"""))

from IPython.display import IFrame
In [2]:
import pandas as pd
import numpy as np
from __future__ import division
In [3]:
import xgboost as xgb
import sys
sys.path.insert(0,'../')
from utils.paths import *

Explain the grades

In [5]:
import pickle

# Load the pickled model artifacts and the CSV tables used in the rest of the notebook.
# `fs` and `path_SBA` are not defined in this notebook; presumably they come from
# `from utils.paths import *` -- TODO confirm.
#
# NOTE(review): pickle.load can execute arbitrary code stored in the file. Only
# load artifacts that were written by this project.
with fs.open(path_SBA + 'clf_xgb.dat', 'rb') as fp_in:
    clf_xgb = pickle.load(fp_in)

with fs.open(path_SBA + 'bst_ex.dat', 'rb') as fp_in:
    bst_ex = pickle.load(fp_in)

with fs.open(path_SBA + 'dict_categorical.pkl', 'rb') as fp_in:
    dict_categorical = pickle.load(fp_in)

# All three tables are ';'-separated CSV files.
result_table_proj = pd.read_csv(path_SBA + 'result_table_proj.csv', sep=';', low_memory=False)
proj_bas = pd.read_csv(path_SBA + 'proj_bas.csv', sep=';', low_memory=False)
nat5 = pd.read_csv(path_SBA + 'nat5.csv', sep=';', low_memory=False)
In [6]:
import eli5

Overall contribution of the variables

(Similar to a feature-importance table)

In [7]:
eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="weight")
Out[7]:
Weight Feature
0.1536 Zip5d_INT
0.1266 City_INT
0.0970 fips_INT
0.0768 Zip3d_INT
0.0730 Bank_INT
0.0663 NoEmp
0.0571 RetainedJob
0.0533 NAICS_group_INT
0.0472 NAICS_default_rate
0.0403 CreateJob
0.0391 State_INT
0.0371 suffix_INT
0.0360 BankState_INT
0.0152 BusinessType_INT
0.0128 UrbanRural
0.0117 Expanding_ratio_INT
0.0113 NewExist
0.0102 Retaining_ratio_INT
0.0094 Expanding
0.0062 Loan_age
… 6 more …
# Same table as above for the other two importance types. The two calls were
# fused onto one line (a syntax error); each is wrapped in display() so that
# both tables render even when they share a cell.
display(eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="gain"))
display(eli5.show_weights(bst_ex, vec = dict_categorical, importance_type="cover"))
In [10]:
result_table_proj.head()
Out[10]:
LoanNr_ChkDgt Name ApprovalFY State default ChgOffPrinGr GrAppv SBA_Appv SBA_ratio prob Grade
0 1075325003 Gilly's, LLC 2005 FL 0 0.0 10000.0 5000.0 0.5 0.130445 3
1 1075335006 Gerald Christiansen dba Big G 2005 UT 0 0.0 36000.0 18000.0 0.5 0.090886 2
2 1075345009 Steven Wilson 2005 KY 0 0.0 10000.0 5000.0 0.5 0.068558 2
3 1075355001 Duraport Marine And Rail Termi 2005 NJ 0 0.0 100000.0 50000.0 0.5 0.056876 2
4 1075365004 Rucinsky's Painting and Decora 2005 WI 0 0.0 225000.0 112500.0 0.5 0.040396 1
In [11]:
proj_bas.head()
Out[11]:
NoEmp NewExist CreateJob RetainedJob UrbanRural RealEstate NAICS_default_rate Loan_age Previous_loan default_times ... Bank_INT BankState_INT RevLineCr_INT LowDoc_INT NAICS_group_INT suffix_INT fips_INT BusinessType_INT Expanding_ratio_INT Retaining_ratio_INT
0 1 2.0 0 0 1 0 19.0 0 0 0 ... 194.0 28.0 1 0.0 6 16 305.0 0.0 1 0
1 3 1.0 0 0 2 0 23.0 0 0 0 ... 2500.0 45.0 1 0.0 4 20 2206.0 1.0 1 0
2 1 1.0 0 0 1 0 23.0 0 0 0 ... 1808.0 18.0 1 0.0 4 20 784.0 1.0 1 0
3 5 1.0 0 0 1 0 19.0 0 0 0 ... 194.0 40.0 1 0.0 12 20 1379.0 -1.0 1 0
4 1 1.0 0 0 1 1 29.0 0 0 0 ... 158.0 49.0 0 0.0 11 20 2460.0 0.0 1 0

5 rows × 26 columns

In [12]:
# Example of good companies
result_table_proj[(result_table_proj.default == 0) & (result_table_proj.Grade == 1)].sample(5, random_state=1)
Out[12]:
LoanNr_ChkDgt Name ApprovalFY State default ChgOffPrinGr GrAppv SBA_Appv SBA_ratio prob Grade
55707 8742894010 HEMP TRADERS 2005 CA 0 0.0 76500.0 65025.0 0.85 0.030161 1
67636 8970164005 ADVANCED MEDICAL PRODUCTS OF S 2005 FL 0 0.0 402000.0 402000.0 1.00 0.002414 1
54637 8718124002 BRIDGEPORT INN 2005 WI 0 0.0 628000.0 628000.0 1.00 0.013276 1
55022 8727904002 OPTIMAL FITNESS LLC 2005 CA 0 0.0 17500.0 8750.0 0.50 0.041170 1
72453 9058304006 WESTERN TECHNICAL SERVICES 2005 CA 0 0.0 350000.0 175000.0 0.50 0.039149 1
In [13]:
# Example of bad companies
result_table_proj[(result_table_proj.default == 1) & (result_table_proj.Grade == 5)].sample(5, random_state=1)
Out[13]:
LoanNr_ChkDgt Name ApprovalFY State default ChgOffPrinGr GrAppv SBA_Appv SBA_ratio prob Grade
25177 1382275001 Michele C. McAvoy dba Accent o 2005 CO 1 50043.0 75000.0 37500.0 0.50 0.247884 5
25838 1390445004 Touchwood Associates, Inc. 2005 GA 1 29912.0 30500.0 15250.0 0.50 0.609766 5
31026 1455825002 Fine Woods International LLC 2005 FL 1 100000.0 100000.0 50000.0 0.50 0.661664 5
58530 8802884010 JACKSON INDUSTRIAL EQUITY INC 2005 TX 1 8233.0 10000.0 8500.0 0.85 0.470765 5
31366 1460195008 Phillip Mark 2005 IN 1 9858.0 15000.0 7500.0 0.50 0.380532 5

Contributions from variables

In [14]:
# Example of good company
eli5.show_prediction(bst_ex, proj_bas.loc[55707], show_feature_values=True)
Out[14]:

y (score -3.471) top features

Contribution? Feature Value
+0.123 RealEstate 0.000
+0.067 BusinessType_INT 1.000
+0.025 RevLineCr_INT 0.000
+0.020 Expanding 0.000
+0.018 Retaining_ratio_INT 0.000
+0.015 suffix_INT 20.000
+0.012 State_INT 4.000
+0.003 Retaining 0.000
+0.001 NAICS_group_INT 6.000
+0.000 CreateJob 0.000
-0.004 Expanding_ratio_INT 1.000
-0.007 RetainedJob 0.000
-0.008 NewExist 1.000
-0.017 NAICS_default_rate 19.000
-0.018 default_times 0.000
-0.019 UrbanRural 1.000
-0.040 City_INT 4954.000
-0.049 LowDoc_INT 1.000
-0.116 NoEmp 3.000
-0.144 fips_INT 148.000
-0.152 BankState_INT 4.000
-0.209 Zip3d_INT 820.000
-0.229 Previous_loan 1.000
-0.236 Bank_INT 1729.000
-0.351 Zip5d_INT 14097.000
-0.498 Loan_age 6.000
-1.658 <BIAS> 1.000
In [15]:
# Example of bad company
eli5.show_prediction(bst_ex, proj_bas.loc[25177], show_feature_values=True)
Out[15]:

y (score -1.110) top features

Contribution? Feature Value
+0.264 Bank_INT 409.000
+0.188 RevLineCr_INT 0.000
+0.159 BusinessType_INT 1.000
+0.132 BankState_INT 46.000
+0.123 RealEstate 0.000
+0.096 State_INT 5.000
+0.082 suffix_INT 20.000
+0.070 NoEmp 1.000
+0.066 Expanding_ratio_INT 3.000
+0.061 City_INT -1.000
+0.048 LowDoc_INT 0.000
+0.025 RetainedJob 1.000
+0.021 Retaining_ratio_INT 2.000
+0.019 NewExist 1.000
+0.014 Loan_age 0.000
+0.011 Zip3d_INT 756.000
+0.009 Retaining 1.000
+0.008 Expanding 1.000
+0.007 Previous_loan 0.000
-0.008 default_times 0.000
-0.023 CreateJob 1.000
-0.089 NAICS_group_INT 16.000
-0.092 Zip5d_INT 13242.000
-0.097 fips_INT 233.000
-0.153 UrbanRural 2.000
-0.392 NAICS_default_rate 10.000
-1.658 <BIAS> 1.000
In [16]:
# Look up selected nat5 columns for row 25177, the "bad company" example.
# Assumes nat5 shares its row index with proj_bas -- TODO confirm.
nat5.loc[25177, ['Bank', 'BankState', 'fips', 'RealEstate', 'NAICS_group']]
Out[16]:
Bank           CAPITAL ONE NATL ASSOC
BankState                          VA
fips                             8107
RealEstate                          0
NAICS_group                        62
Name: 25177, dtype: object
In [17]:
nat5[nat5.default == 1].Bank.value_counts().head()
Out[17]:
BANK OF AMERICA NATL ASSOC     3624
CAPITAL ONE NATL ASSOC         2257
CITIZENS BANK NATL ASSOC       2248
BBCN BANK                      1881
WELLS FARGO BANK NATL ASSOC     983
Name: Bank, dtype: int64
In [18]:
nat5[nat5.default == 1].BankState.value_counts().head()
Out[18]:
NC    3553
CA    2951
VA    2794
RI    2454
SD    1044
Name: BankState, dtype: int64

Grouping the variables

In [19]:
# Mapping from a display group name to the model feature names that are summed
# into that group by var_group_contribution.
var_group = {
    # Geography of the borrower and of the lending bank.
    'Location': [
        'BankState_INT', 'fips_INT', 'UrbanRural', 'State_INT',
        'Zip5d_INT', 'Zip3d_INT', 'City_INT',
    ],
    'Bus. Size': [
        'RealEstate', 'NewExist', 'NoEmp', 'BusinessType_INT',
    ],
    # Job creation / retention indicators.
    'Bus. Status': [
        'Expanding', 'Expanding_ratio_INT', 'Retaining',
        'CreateJob', 'Retaining_ratio_INT', 'RetainedJob',
    ],
    'Bus. Sector': [
        'NAICS_group_INT', 'NAICS_default_rate', 'suffix_INT',
    ],
    'Loan quality': [
        'Bank_INT', 'LowDoc_INT', 'RevLineCr_INT',
    ],
    'Past records': [
        'Loan_age', 'Previous_loan', 'default_times',
    ],
}

def var_group_contribution(LOC, model = bst_ex, m_input = proj_bas, var_group = var_group):
    """Sum the eli5 per-feature contributions of one row into feature groups.

    Parameters
    ----------
    LOC : index label
        Label of the row of ``m_input`` to explain (looked up with ``.loc``).
    model
        Model handed to ``eli5.explain_prediction_df``.
    m_input : DataFrame
        Model input table the row is taken from.
    var_group : dict
        Maps a group name to the list of feature names belonging to it.

    Returns
    -------
    dict
        Group name -> sum of the ``weight`` column over the explanation rows
        whose ``feature`` is in that group. Features that belong to no group
        (for example the ``<BIAS>`` term) are not counted anywhere.
    """
    explanation = eli5.explain_prediction_df(model, m_input.loc[LOC])
    return {
        name: explanation[explanation.feature.isin(members)].weight.sum()
        for name, members in var_group.items()
    }
In [20]:
# Example of good company
var_group_contribution(55707)
Out[20]:
{'Bus. Sector': -0.0009546008142414658,
 'Bus. Size': 0.06575683895465546,
 'Bus. Status': 0.031198092448428603,
 'Loan quality': -0.2599932012631341,
 'Location': -0.9033791485221936,
 'Past records': -0.7447348797019406}
In [21]:
# Example of bad company
var_group_contribution(25177)
Out[21]:
{'Bus. Sector': -0.39833487977406484,
 'Bus. Size': 0.371047833304924,
 'Bus. Status': 0.1057018346408255,
 'Loan quality': 0.49978055349471756,
 'Location': -0.04220114444881927,
 'Past records': 0.012544438933989842}

Create graphical interface

In [22]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import cufflinks as cf
init_notebook_mode()
cf.go_offline()
In [23]:
def plot_var_group_contribution(LOC, model = bst_ex, m_input = proj_bas, var_group = var_group, r_range = (-1, 1)):
    """Plot one row's grouped feature contributions as a filled radar chart.

    Parameters
    ----------
    LOC : index label
        Label of the row of ``m_input`` to explain.
    model, m_input, var_group
        Forwarded unchanged to ``var_group_contribution``.
    r_range : pair of numbers, optional
        Lower and upper bound of the radial axis. Defaults to ``(-1, 1)``,
        the range that used to be hard-coded; pass a wider range for rows
        whose group contributions fall outside it.

    Returns
    -------
    Whatever ``iplot`` returns for the figure.
    """
    var_group_con = var_group_contribution(LOC, model, m_input, var_group)

    # list(...) works on Python 2 and 3. The previous `keys() + [keys()[0]]`
    # raises TypeError on Python 3, where keys() is a view that can be neither
    # concatenated nor indexed.
    group_names = list(var_group_con)
    # Repeat the first group at the end so the polygon is closed. The slice
    # (rather than [0]) keeps an empty mapping from raising IndexError.
    _theta = group_names + group_names[:1]

    trace = go.Scatterpolar(
        theta = _theta,
        r = [var_group_con[k] for k in _theta],
        fill = 'toself'
    )
    layout = go.Layout(
        polar = dict(
            radialaxis = dict(
                visible = True,
                range = list(r_range)
            )
        ),
        showlegend = False
    )
    fig = go.Figure(data=[trace], layout=layout)
    return iplot(fig)
In [24]:
# Example of good company
plot_var_group_contribution(55707)
In [25]:
# Example of bad company
plot_var_group_contribution(25177)